{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyNmF+zC9jZOUPQc3iK0CVHx",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/Huangjian2013/ai-demo/blob/main/rag/10-LangChain-pdf-rag.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "N7rbJ7CbVUC8"
      },
      "outputs": [],
      "source": [
        "# Install every dependency in one resolver pass. `%pip` installs into the\n",
        "# environment of the running kernel, which `!pip` does not guarantee.\n",
        "# NOTE(review): versions are unpinned -- pin them to the releases this\n",
        "# notebook was validated against so a fresh run stays reproducible.\n",
        "%pip install --quiet langchain langchain-openai langchain-community pypdf chromadb tiktoken"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from google.colab import userdata\n",
        "from langchain_openai import ChatOpenAI\n",
        "from langchain.document_loaders import PyPDFLoader\n",
        "from langchain.prompts import ChatPromptTemplate\n",
        "from langchain.schema.output_parser import StrOutputParser\n",
        "from langchain.text_splitter import CharacterTextSplitter\n",
        "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
        "from langchain.vectorstores import Chroma\n",
        "from langchain.embeddings import OpenAIEmbeddings\n",
        "from langchain.chains import RetrievalQA"
      ],
      "metadata": {
        "id": "pBXoK4AOXiZU"
      },
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Chat model for answering questions; the API key is read through\n",
        "# google.colab's `userdata` instead of being hardcoded in the notebook.\n",
        "model = ChatOpenAI(\n",
        "    api_key=userdata.get('REAL_OPENAI_KEY'),\n",
        ")"
      ],
      "metadata": {
        "id": "xIQszadqXmUv"
      },
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Point the loader at the PDF (path is relative to the working directory).\n",
        "loader = PyPDFLoader(\n",
        "    \"sample_data/极越01.pdf\",\n",
        ")\n",
        "\n",
        "# Read the file into LangChain documents.\n",
        "documents = loader.load()"
      ],
      "metadata": {
        "id": "KoaluuZWQ5iC"
      },
      "execution_count": 13,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Split the loaded documents into chunks of at most 1000 characters.\n",
        "# CharacterTextSplitter only cuts on its single separator (a blank line by\n",
        "# default), so text without blank lines stays one oversized chunk. The\n",
        "# recursive splitter falls back to finer separators until chunks fit chunk_size.\n",
        "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
        "docs = text_splitter.split_documents(documents)"
      ],
      "metadata": {
        "id": "1q0ZR3MiQ6a6"
      },
      "execution_count": 14,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Embedding client, authenticated with the same Colab-stored key as the chat model.\n",
        "embeddings = OpenAIEmbeddings(\n",
        "    api_key=userdata.get('REAL_OPENAI_KEY'),\n",
        ")\n",
        "\n",
        "# Embed every chunk and index it in a Chroma vector store.\n",
        "# NOTE(review): re-running this cell may add the same chunks to the store\n",
        "# again -- confirm, and restart the kernel before re-indexing if so.\n",
        "docsearch = Chroma.from_documents(\n",
        "    documents=docs,\n",
        "    embedding=embeddings,\n",
        ")"
      ],
      "metadata": {
        "id": "yCv3fxGmQ8b0"
      },
      "execution_count": 15,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Check retrieval on its own before wiring it into the QA chain.\n",
        "retriever = docsearch.as_retriever()\n",
        "\n",
        "question = \"极越01车型尺寸\"\n",
        "\n",
        "# `invoke` is the Runnable entry point, the same one used on the QA chain;\n",
        "# `get_relevant_documents` is deprecated in its favour.\n",
        "retrieved_documents = retriever.invoke(question)\n",
        "\n",
        "# Print each retrieved chunk, separated by a divider line.\n",
        "for document in retrieved_documents:\n",
        "    print(\"-------------------------------------\")\n",
        "    print(document.page_content)\n"
      ],
      "metadata": {
        "id": "OTHfgoodtDlb"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Build a \"stuff\" RetrievalQA chain from the chat model and the retriever.\n",
        "qa = RetrievalQA.from_chain_type(\n",
        "    llm=model,\n",
        "    chain_type=\"stuff\",\n",
        "    retriever=retriever,\n",
        ")\n",
        "\n",
        "# The chain's single input key is \"query\"; the cell displays the returned result.\n",
        "qa.invoke({\"query\": question})"
      ],
      "metadata": {
        "id": "-Xs_lrQJQw-p"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}